library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## v purrr   0.3.4
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%()       masks ggplot2::%+%()
## x psych::alpha()     masks ggplot2::alpha()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x dplyr::src()       masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(skimr)
## Warning: package 'skimr' was built under R version 4.0.5
library(purrr)
library(tidyr)
library(tidyverse)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
dfTrain <- read.csv("D:\\RStudio\\621\\Baseball\\moneyball-training-data.csv", header=TRUE)
dfEval <- read.csv("D:\\RStudio\\621\\Baseball\\moneyball-evaluation-data.csv", header=TRUE)

colnames(dfTrain)<-gsub("TEAM_","",colnames(dfTrain))

dfTrain2 <- dfTrain
summary(dfTrain)
##      INDEX         TARGET_WINS       BATTING_H      BATTING_2B   
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##    BATTING_3B       BATTING_HR       BATTING_BB      BATTING_SO    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##    BASERUN_SB      BASERUN_CS     BATTING_HBP      PITCHING_H   
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00   Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50   1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00   Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36   Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00   3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00   Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                   
##   PITCHING_HR     PITCHING_BB      PITCHING_SO        FIELDING_E    
##  Min.   :  0.0   Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0   1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0   Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7   Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0   3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0   Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                   NA's   :102                       
##   FIELDING_DP   
##  Min.   : 52.0  
##  1st Qu.:131.0  
##  Median :149.0  
##  Mean   :146.4  
##  3rd Qu.:164.0  
##  Max.   :228.0  
##  NA's   :286
summary(dfEval)
##      INDEX      TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :   9   Min.   : 819   Min.   : 44.0   Min.   : 14.00  
##  1st Qu.: 708   1st Qu.:1387   1st Qu.:210.0   1st Qu.: 35.00  
##  Median :1249   Median :1455   Median :239.0   Median : 52.00  
##  Mean   :1264   Mean   :1469   Mean   :241.3   Mean   : 55.91  
##  3rd Qu.:1832   3rd Qu.:1548   3rd Qu.:278.5   3rd Qu.: 72.00  
##  Max.   :2525   Max.   :2170   Max.   :376.0   Max.   :155.00  
##                                                                
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   : 15.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 44.50   1st Qu.:436.5   1st Qu.: 545.0   1st Qu.: 59.0  
##  Median :101.00   Median :509.0   Median : 686.0   Median : 92.0  
##  Mean   : 95.63   Mean   :499.0   Mean   : 709.3   Mean   :123.7  
##  3rd Qu.:135.50   3rd Qu.:565.5   3rd Qu.: 912.0   3rd Qu.:151.8  
##  Max.   :242.00   Max.   :792.0   Max.   :1268.0   Max.   :580.0  
##                                   NA's   :18       NA's   :13     
##  TEAM_BASERUN_CS  TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.00   Min.   :42.00    Min.   : 1155   Min.   :  0.0   
##  1st Qu.: 38.00   1st Qu.:53.50    1st Qu.: 1426   1st Qu.: 52.0   
##  Median : 49.50   Median :62.00    Median : 1515   Median :104.0   
##  Mean   : 52.32   Mean   :62.37    Mean   : 1813   Mean   :102.1   
##  3rd Qu.: 63.00   3rd Qu.:67.50    3rd Qu.: 1681   3rd Qu.:142.5   
##  Max.   :154.00   Max.   :96.00    Max.   :22768   Max.   :336.0   
##  NA's   :87       NA's   :240                                      
##  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   : 136.0   Min.   :   0.0   Min.   :  73.0   Min.   : 69.0   
##  1st Qu.: 471.0   1st Qu.: 613.0   1st Qu.: 131.0   1st Qu.:131.0   
##  Median : 526.0   Median : 745.0   Median : 163.0   Median :148.0   
##  Mean   : 552.4   Mean   : 799.7   Mean   : 249.7   Mean   :146.1   
##  3rd Qu.: 606.5   3rd Qu.: 938.0   3rd Qu.: 252.0   3rd Qu.:164.0   
##  Max.   :2008.0   Max.   :9963.0   Max.   :1568.0   Max.   :204.0   
##                   NA's   :18                        NA's   :31

Initial Exploration

We begin with an initial exporation of the dataset.

dim(dfTrain)
## [1] 2276   17
summary(dfTrain)
##      INDEX         TARGET_WINS       BATTING_H      BATTING_2B   
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##    BATTING_3B       BATTING_HR       BATTING_BB      BATTING_SO    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##    BASERUN_SB      BASERUN_CS     BATTING_HBP      PITCHING_H   
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00   Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50   1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00   Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36   Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00   3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00   Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                   
##   PITCHING_HR     PITCHING_BB      PITCHING_SO        FIELDING_E    
##  Min.   :  0.0   Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0   1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0   Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7   Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0   3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0   Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                   NA's   :102                       
##   FIELDING_DP   
##  Min.   : 52.0  
##  1st Qu.:131.0  
##  Median :149.0  
##  Mean   :146.4  
##  3rd Qu.:164.0  
##  Max.   :228.0  
##  NA's   :286
skim(dfTrain)
Data summary
Name dfTrain
Number of rows 2276
Number of columns 17
_______________________
Column type frequency:
numeric 17
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INDEX 0 1.00 1268.46 736.35 1 630.75 1270.5 1915.50 2535 ▇▇▇▇▇
TARGET_WINS 0 1.00 80.79 15.75 0 71.00 82.0 92.00 146 ▁▁▇▅▁
BATTING_H 0 1.00 1469.27 144.59 891 1383.00 1454.0 1537.25 2554 ▁▇▂▁▁
BATTING_2B 0 1.00 241.25 46.80 69 208.00 238.0 273.00 458 ▁▆▇▂▁
BATTING_3B 0 1.00 55.25 27.94 0 34.00 47.0 72.00 223 ▇▇▂▁▁
BATTING_HR 0 1.00 99.61 60.55 0 42.00 102.0 147.00 264 ▇▆▇▅▁
BATTING_BB 0 1.00 501.56 122.67 0 451.00 512.0 580.00 878 ▁▁▇▇▁
BATTING_SO 102 0.96 735.61 248.53 0 548.00 750.0 930.00 1399 ▁▆▇▇▁
BASERUN_SB 131 0.94 124.76 87.79 0 66.00 101.0 156.00 697 ▇▃▁▁▁
BASERUN_CS 772 0.66 52.80 22.96 0 38.00 49.0 62.00 201 ▃▇▁▁▁
BATTING_HBP 2085 0.08 59.36 12.97 29 50.50 58.0 67.00 95 ▂▇▇▅▁
PITCHING_H 0 1.00 1779.21 1406.84 1137 1419.00 1518.0 1682.50 30132 ▇▁▁▁▁
PITCHING_HR 0 1.00 105.70 61.30 0 50.00 107.0 150.00 343 ▇▇▆▁▁
PITCHING_BB 0 1.00 553.01 166.36 0 476.00 536.5 611.00 3645 ▇▁▁▁▁
PITCHING_SO 102 0.96 817.73 553.09 0 615.00 813.5 968.00 19278 ▇▁▁▁▁
FIELDING_E 0 1.00 246.48 227.77 65 127.00 159.0 249.25 1898 ▇▁▁▁▁
FIELDING_DP 286 0.87 146.39 26.23 52 131.00 149.0 164.00 228 ▁▂▇▆▁
str(dfTrain)
## 'data.frame':    2276 obs. of  17 variables:
##  $ INDEX      : int  1 2 3 4 5 6 7 8 11 12 ...
##  $ TARGET_WINS: int  39 70 86 70 82 75 80 85 86 76 ...
##  $ BATTING_H  : int  1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
##  $ BATTING_2B : int  194 219 232 209 186 200 179 171 197 213 ...
##  $ BATTING_3B : int  39 22 35 38 27 36 54 37 40 18 ...
##  $ BATTING_HR : int  13 190 137 96 102 92 122 115 114 96 ...
##  $ BATTING_BB : int  143 685 602 451 472 443 525 456 447 441 ...
##  $ BATTING_SO : int  842 1075 917 922 920 973 1062 1027 922 827 ...
##  $ BASERUN_SB : int  NA 37 46 43 49 107 80 40 69 72 ...
##  $ BASERUN_CS : int  NA 28 27 30 39 59 54 36 27 34 ...
##  $ BATTING_HBP: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PITCHING_H : int  9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
##  $ PITCHING_HR: int  84 191 137 97 102 92 122 116 114 96 ...
##  $ PITCHING_BB: int  927 689 602 454 472 443 525 459 447 441 ...
##  $ PITCHING_SO: int  5456 1082 917 928 920 973 1062 1033 922 827 ...
##  $ FIELDING_E : int  1011 193 175 164 138 123 136 112 127 131 ...
##  $ FIELDING_DP: int  NA 155 153 156 168 149 186 136 169 159 ...

Taking care of NA

Batting_HPBA has too many so we remove it:

dfTrain2 <- dfTrain2 %>%
  dplyr::select(-BATTING_HBP) 

Before we impute the values for NAs, we need to ensure there isn’t any kind of grouping effect for the records with NA. Fact that several columns have the same number of missings suggests there might be. So first we look to see if the missings are correlated:

dfTrain2 <- dfTrain2 %>%
  mutate(Missing_Flag = ifelse(is.na(BATTING_SO),1,0))

dfTrain3 <- dfTrain2 %>%
  dplyr::filter(Missing_Flag == 0) %>%
  dplyr::select(BATTING_SO, PITCHING_SO, BASERUN_CS, BASERUN_SB) 
  

summary(dfTrain3)
##    BATTING_SO      PITCHING_SO        BASERUN_CS      BASERUN_SB   
##  Min.   :   0.0   Min.   :    0.0   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.: 548.0   1st Qu.:  615.0   1st Qu.: 38.0   1st Qu.: 65.0  
##  Median : 750.0   Median :  813.5   Median : 49.0   Median : 98.0  
##  Mean   : 735.6   Mean   :  817.7   Mean   : 52.8   Mean   :120.8  
##  3rd Qu.: 930.0   3rd Qu.:  968.0   3rd Qu.: 62.0   3rd Qu.:147.0  
##  Max.   :1399.0   Max.   :19278.0   Max.   :201.0   Max.   :697.0  
##                                     NA's   :670     NA's   :131

There is some cohort effect as there is complete duplication with pitching so and batting so, and some overlap with baserun cs. Now lets impute the mean/median and see how well the new model performs vs the old:

dfTrain_ImputedMedian <- data.frame(
    sapply(dfTrain2, function(x) ifelse(is.na(x), median(x, na.rm = TRUE), x)))

dfTrain_ImputedMean <- data.frame(
    sapply(dfTrain2, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x)))

dfTrain_ImputedMean_NoCohort <- dfTrain_ImputedMean %>%
  filter(Missing_Flag==0) %>%
  dplyr::select(-Missing_Flag)

dfTrain_ImputedMedian_NoCohort <- dfTrain_ImputedMedian %>%
  filter(Missing_Flag==0) %>%
  dplyr::select(-Missing_Flag)

m1 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMedian)
m2 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMedian_NoCohort)
m3 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMean)
m4 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMean_NoCohort)

summary(m1)$adj.r.squared
## [1] 0.313437
summary(m2)$adj.r.squared
## [1] 0.3147084
summary(m3)$adj.r.squared
## [1] 0.3169625
summary(m4)$adj.r.squared
## [1] 0.3178529

There appears to be a minor effect. Imputing the mean to the other columns with NA and removing cohort records has a very small positive effect on the model.

Now we can look at interactions between the “cohort” and other variables:

EHExplore_Interactions_Scatterplots <- function(df, y, interaction) {

library(ggsci)
  
plot_list <- list()

df$Missing_Flag <- as.factor(df$Missing_Flag)

for(i in 1:ncol(df)) {                              
  p <- eval(substitute(ggplot(df, aes_string(df[ , i], y, color=interaction)) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE) +
  #scale_color_brewer(palette="Set1") +
  xlab("") +
  theme(title = element_text(size=7)) +
  scale_color_d3()+
  scale_fill_d3()+
  ggtitle(colnames(df)[i]), list(i=i)))
  plot_list[[i]] <- p 
}
  return(plot_list)
}

library(patchwork)

dfTmp <- dfTrain_ImputedMean %>%
  mutate(Pitch_h_Under1500 = as.factor(ifelse(PITCHING_H<=1500, 1, 0)))

z1 <- EHExplore_Interactions_Scatterplots(dfTrain_ImputedMean, "TARGET_WINS", "Missing_Flag")

grid.arrange(grobs=z1[c(2:7)],  ncol=2, nrow=3)

grid.arrange(grobs=z1[c(8:13)],  ncol=2, nrow=3)

grid.arrange(grobs=z1[c(14:16)],  ncol=2, nrow=3)

The interaction analysis suggests that the cohort is not random - there are numerous interactions with many other variables, some of which are quite counterinutitive (team pitching H). So we could either do a random effects/flag/interactions or toss them. Becuase bad data is not reproducible I will toss, at the expense of better predicitons if I can identify the cohort in the eval data.

No we can look at the stats in ur new dataset.

summary(dfTrain_ImputedMean_NoCohort)
##      INDEX         TARGET_WINS       BATTING_H      BATTING_2B   
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 640.2   1st Qu.: 71.00   1st Qu.:1389   1st Qu.:211.2  
##  Median :1275.5   Median : 82.00   Median :1458   Median :240.0  
##  Mean   :1275.2   Mean   : 80.76   Mean   :1475   Mean   :243.9  
##  3rd Qu.:1923.8   3rd Qu.: 91.00   3rd Qu.:1541   3rd Qu.:275.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##    BATTING_3B       BATTING_HR      BATTING_BB      BATTING_SO    
##  Min.   :  0.00   Min.   :  0.0   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 48.0   1st Qu.:456.0   1st Qu.: 548.0  
##  Median : 46.00   Median :107.0   Median :517.0   Median : 750.0  
##  Mean   : 54.45   Mean   :103.4   Mean   :505.1   Mean   : 735.6  
##  3rd Qu.: 71.00   3rd Qu.:148.0   3rd Qu.:582.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.0   Max.   :878.0   Max.   :1399.0  
##    BASERUN_SB      BASERUN_CS      PITCHING_H     PITCHING_HR   
##  Min.   :  0.0   Min.   :  0.0   Min.   : 1137   Min.   :  0.0  
##  1st Qu.: 66.0   1st Qu.: 44.0   1st Qu.: 1425   1st Qu.: 58.0  
##  Median :102.0   Median : 52.8   Median : 1521   Median :111.0  
##  Mean   :121.1   Mean   : 52.8   Mean   : 1794   Mean   :109.7  
##  3rd Qu.:143.8   3rd Qu.: 55.0   3rd Qu.: 1694   3rd Qu.:152.8  
##  Max.   :697.0   Max.   :201.0   Max.   :30132   Max.   :343.0  
##   PITCHING_BB      PITCHING_SO        FIELDING_E      FIELDING_DP   
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0  
##  1st Qu.: 479.2   1st Qu.:  615.0   1st Qu.: 126.0   1st Qu.:137.0  
##  Median : 542.0   Median :  813.5   Median : 155.0   Median :146.4  
##  Mean   : 557.5   Mean   :  817.7   Mean   : 243.9   Mean   :148.6  
##  3rd Qu.: 614.8   3rd Qu.:  968.0   3rd Qu.: 234.0   3rd Qu.:162.0  
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0

Now we can do Outlier Analysis, and check to see if zeroes may be coded as nas

EHExplore_Outliers_Boxplots <- function(df, size="small")
{
  
s <- 6
if (size=="large") {
  s <- 10
}
plot_list2 <- list()

for(i in 1:ncol(df)) {     
  
  qp <- toString(head(sort(df[,i]),5))
  qz <- toString(tail(sort(df[,i]),5))
  qk <- str_c(qp, "\\\n", qz)
  
  p <- eval(substitute(ggplot(df, aes(df[,i])) +
          coord_flip() +  
          xlab(colnames(df)[i])  +
          ylab(qk) +
          theme(axis.title.x = element_text(size = s), axis.title.y = element_text(size = 9), axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "slategray2", color="darkslategray"))  + 
          geom_boxplot(), list(i=i)))
  plot_list2[[i]] <- p 
}
return (plot_list2)
}

z <- EHExplore_Outliers_Boxplots(dfTrain_ImputedMean_NoCohort, "small")
wrap_plots(z)

EHExplore_Distributions_Histograms <- function(df, size = "small")
{
s <- 6
if (size=="large") {
  s <- 10
}

plot_list2 <- list()

for(i in 1:ncol(df)) {     
  
  qp <- toString(head(sort(df[,i]),5))
  qz <- toString(tail(sort(df[,i]),5))
  qk <- str_c("L: ", qp, "\\\n", "H: ", qz)
  
  p <- eval(substitute(ggplot(df, aes(df[,i])) +
          ylab(colnames(df)[i])  +
          xlab(qk) +
          theme(axis.title.x = element_text(size = s), axis.title.y = element_text(size = 9), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_text(size=8),  panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_blank(), panel.background = element_rect(fill = "slategray2", color="darkslategray"))  + 
  geom_histogram(bins=100, fill="white", aes(y = stat(density))) +
      geom_density(col = "red"), list(i=i)))
  plot_list2[[i]] <- p 
}
return (plot_list2)
}

z6 <- EHExplore_Distributions_Histograms(dfTrain_ImputedMean_NoCohort, "small")
wrap_plots(z6)

zz1 <- list()

for(i in 1:length(z)) {
zz1[i*2-1] <- z[i]
zz1[i*2] <- z6[i]
}

grid.arrange(grobs=zz1[c(1:16)],  ncol=4, nrow=4)

grid.arrange(grobs=zz1[c(17:32)],  ncol=4, nrow=4)

There are 4 categories where 0s may be nas: Pitching and Batting HR and Pitching and batting SO. We look more closely at these categories:

dfTrain_ZeroAsNA <- dfTrain %>%
dplyr::select(PITCHING_SO, PITCHING_HR, BATTING_SO, BATTING_HR)

hist(dfTrain_ZeroAsNA)

We can check to see if the zeroes behave like nas or actual values. We compare the interaction with Pitching_h in both cases. They behave very differently, neither like the overall sample:

dfTmp <- dfTrain_ImputedMean %>%
  mutate(Zeros = as.factor(ifelse(PITCHING_SO <= 0, 1, 0)))

z2 <- EHExplore_Interactions_Scatterplots(dfTmp, "TARGET_WINS", "Zeros")

grid.arrange(z2[[11]], z1[[11]],  ncol=2, nrow=3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Looking for other gorups, Hard to say - there seems to be something about lower so being more negatively correlated with wins than later - but the ns may be small:

dfTmp <- dfTrain_ImputedMean %>%
  mutate(Zeros = as.factor(ifelse(PITCHING_SO <= 400 & PITCHING_SO >=0, 1, 0)))

dfx <- dfTmp %>%
  filter(Zeros==1)

z2 <- EHExplore_Interactions_Scatterplots(dfTmp, "TARGET_WINS", "Zeros")

grid.arrange(z2[[11]], z1[[11]],  ncol=2, nrow=3)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Will do nothing with outliers or na as zero for now

Second Exploration

a. Dependent variable

hist(dfTrain$TARGET_WINS, bins=20)
## Warning in plot.window(xlim, ylim, "", ...): "bins" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "bins"
## is not a graphical parameter
## Warning in axis(1, ...): "bins" is not a graphical parameter
## Warning in axis(2, ...): "bins" is not a graphical parameter

head(sort(dfTrain$TARGET_WINS))
## [1]  0 12 14 17 21 22
dfTrain_ZeroWins <- dfTrain %>%
  dplyr::filter(TARGET_WINS ==0)

head(dfTrain_ZeroWins, 1)
##   INDEX TARGET_WINS BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB
## 1  1347           0       891        135          0          0          0
##   BATTING_SO BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H PITCHING_HR
## 1          0          0          0          NA      24057           0
##   PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
## 1           0           0       1890          NA

Target_Wins appears normally distributed - the zero is suspicious but I’m going to leave it.

b. Look at correlations throughout the variables and inspect multi-colinnearity

dfCor <- as.data.frame(cor(dfTrain_ImputedMean_NoCohort))
dfCor
##                     INDEX TARGET_WINS   BATTING_H   BATTING_2B  BATTING_3B
## INDEX        1.0000000000 -0.02928140 -0.03131390 -0.003976934 -0.00497585
## TARGET_WINS -0.0292813985  1.00000000  0.39476995  0.293205037  0.13685882
## BATTING_H   -0.0313139014  0.39476995  1.00000000  0.540648272  0.45802046
## BATTING_2B  -0.0039769341  0.29320504  0.54064827  1.000000000 -0.08532550
## BATTING_3B  -0.0049758496  0.13685882  0.45802046 -0.085325497  1.00000000
## BATTING_HR   0.0413809930  0.19059035 -0.06194956  0.393641975 -0.63765753
## BATTING_BB  -0.0358540809  0.23250609 -0.10545406  0.230196649 -0.28160593
## BATTING_SO   0.0814501106 -0.03175071 -0.46385357  0.162685188 -0.66978119
## BASERUN_SB   0.0435154747  0.11143414  0.14886129 -0.153728585  0.49301668
## BASERUN_CS   0.0004632733  0.01610843  0.01198251 -0.077632602  0.19833581
## PITCHING_H   0.0146890757 -0.11576530  0.29979491  0.008872511  0.20396690
## PITCHING_HR  0.0403725584  0.20531868  0.02082589  0.412455481 -0.56629509
## PITCHING_BB -0.0233549401  0.12063924  0.07067846  0.149565361  0.01294580
## PITCHING_SO  0.0558901457 -0.07843609 -0.25265679  0.064792315 -0.25881893
## FIELDING_E  -0.0068738726 -0.17639551  0.28252119 -0.232247607  0.51354615
## FIELDING_DP  0.0061318975 -0.02860414  0.04535652  0.178563220 -0.21908499
##              BATTING_HR  BATTING_BB  BATTING_SO  BASERUN_SB    BASERUN_CS
## INDEX        0.04138099 -0.03585408  0.08145011  0.04351547  0.0004632733
## TARGET_WINS  0.19059035  0.23250609 -0.03175071  0.11143414  0.0161084320
## BATTING_H   -0.06194956 -0.10545406 -0.46385357  0.14886129  0.0119825143
## BATTING_2B   0.39364197  0.23019665  0.16268519 -0.15372858 -0.0776326024
## BATTING_3B  -0.63765753 -0.28160593 -0.66978119  0.49301668  0.1983358054
## BATTING_HR   1.00000000  0.50439692  0.72706935 -0.39942181 -0.3034743273
## BATTING_BB   0.50439692  1.00000000  0.37975087 -0.06545891 -0.0861202523
## BATTING_SO   0.72706935  0.37975087  1.00000000 -0.23837153 -0.1566149092
## BASERUN_SB  -0.39942181 -0.06545891 -0.23837153  1.00000000  0.2869124889
## BASERUN_CS  -0.30347433 -0.08612025 -0.15661491  0.28691249  1.0000000000
## PITCHING_H  -0.27656010 -0.46585690 -0.37568637  0.07198568 -0.0369545996
## PITCHING_HR  0.96659392  0.44681242  0.66717889 -0.36564098 -0.3034478040
## PITCHING_BB  0.10677385  0.47385394  0.03700514  0.14323815 -0.0542531880
## PITCHING_SO  0.18470756 -0.02075682  0.41623330 -0.05615058 -0.0686217842
## FIELDING_E  -0.59891151 -0.66138116 -0.58466444  0.36999309  0.0236201201
## FIELDING_DP  0.33368751  0.32158157  0.14599850 -0.24957358 -0.1563091914
##               PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO   FIELDING_E
## INDEX        0.014689076  0.04037256 -0.02335494  0.05589015 -0.006873873
## TARGET_WINS -0.115765302  0.20531868  0.12063924 -0.07843609 -0.176395507
## BATTING_H    0.299794910  0.02082589  0.07067846 -0.25265679  0.282521195
## BATTING_2B   0.008872511  0.41245548  0.14956536  0.06479231 -0.232247607
## BATTING_3B   0.203966905 -0.56629509  0.01294580 -0.25881893  0.513546149
## BATTING_HR  -0.276560100  0.96659392  0.10677385  0.18470756 -0.598911507
## BATTING_BB  -0.465856896  0.44681242  0.47385394 -0.02075682 -0.661381160
## BATTING_SO  -0.375686369  0.66717889  0.03700514  0.41623330 -0.584664436
## BASERUN_SB   0.071985680 -0.36564098  0.14323815 -0.05615058  0.369993094
## BASERUN_CS  -0.036954600 -0.30344780 -0.05425319 -0.06862178  0.023620120
## PITCHING_H   1.000000000 -0.16448724  0.31845282  0.26724807  0.672838853
## PITCHING_HR -0.164487236  1.00000000  0.19575531  0.20588053 -0.501758136
## PITCHING_BB  0.318452818  0.19575531  1.00000000  0.48849865 -0.016375919
## PITCHING_SO  0.267248074  0.20588053  0.48849865  1.00000000 -0.023291783
## FIELDING_E   0.672838853 -0.50175814 -0.01637592 -0.02329178  1.000000000
## FIELDING_DP -0.088957308  0.32336753  0.15211734  0.01039232 -0.257897297
##              FIELDING_DP
## INDEX        0.006131897
## TARGET_WINS -0.028604138
## BATTING_H    0.045356517
## BATTING_2B   0.178563220
## BATTING_3B  -0.219084985
## BATTING_HR   0.333687510
## BATTING_BB   0.321581568
## BATTING_SO   0.145998500
## BASERUN_SB  -0.249573580
## BASERUN_CS  -0.156309191
## PITCHING_H  -0.088957308
## PITCHING_HR  0.323367525
## PITCHING_BB  0.152117341
## PITCHING_SO  0.010392318
## FIELDING_E  -0.257897297
## FIELDING_DP  1.000000000
heatmap(as.matrix(dfCor), Rowv = NA, Colv = NA)   

Invsteigate suspicious HR category

cor.test(dfTrain$PITCHING_HR, dfTrain$TARGET_WINS)
## 
##  Pearson's product-moment correlation
## 
## data:  dfTrain$PITCHING_HR and dfTrain$TARGET_WINS
## t = 9.1789, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1490846 0.2283275
## sample estimates:
##       cor 
## 0.1890137
ggplot(dfTrain, aes(PITCHING_HR, BATTING_HR, color=INDEX)) +
  geom_point()

hist(dfTrain$PITCHING_HR, breaks=100)

plot(dfTrain$PITCHING_HR, dfTrain$TARGET_WINS)

m1 <- lm(TARGET_WINS ~ PITCHING_HR, data=dfTrain)
summary(m1)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.657  -9.956   0.636  10.055  67.477 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 75.656920   0.646540 117.018   <2e-16 ***
## PITCHING_HR  0.048572   0.005292   9.179   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2274 degrees of freedom
## Multiple R-squared:  0.03573,    Adjusted R-squared:  0.0353 
## F-statistic: 84.25 on 1 and 2274 DF,  p-value: < 2.2e-16
plot(m1)

library(car) 
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:psych':
## 
##     logit
influencePlot(m1, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##        StudRes          Hat        CookD
## 299   4.380293 0.0006944747 0.0066141630
## 832   0.173993 0.0070267976 0.0001071615
## 964  -1.050146 0.0058117315 0.0032231964
## 1211 -4.919225 0.0017463018 0.0209523937
## 2233 -4.132563 0.0017463018 0.0148329515
dfTrain2 <- dfTrain[-c(1211,2233,299,1825, 832), ]
cor.test(dfTrain2$PITCHING_HR, dfTrain2$TARGET_WINS)
## 
##  Pearson's product-moment correlation
## 
## data:  dfTrain2$PITCHING_HR and dfTrain2$TARGET_WINS
## t = 8.8525, df = 2269, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1426547 0.2221771
## sample estimates:
##       cor 
## 0.1827147
m2 <- lm(TARGET_WINS ~ PITCHING_HR, data=dfTrain2)
summary(m2)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfTrain2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.949  -9.929   0.614  10.028  55.992 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 75.948820   0.639361 118.789   <2e-16 ***
## PITCHING_HR  0.046356   0.005237   8.852   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.23 on 2269 degrees of freedom
## Multiple R-squared:  0.03338,    Adjusted R-squared:  0.03296 
## F-statistic: 78.37 on 1 and 2269 DF,  p-value: < 2.2e-16
plot(m2)

library(car) 
influencePlot(m2, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##        StudRes          Hat       CookD
## 964  -1.039523 0.0058683344 0.003189286
## 982  -3.886600 0.0017628831 0.013255859
## 1810  2.114722 0.0049482791 0.011102505
## 1882 -1.303158 0.0058683344 0.005010737
## 2012  3.688318 0.0006272236 0.004245374
summary(m1$residuals)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -75.6569  -9.9562   0.6359   0.0000  10.0552  67.4774
describe(m1$residuals)
##    vars    n mean    sd median trimmed   mad    min   max  range  skew kurtosis
## X1    1 2276    0 15.47   0.64     0.2 14.84 -75.66 67.48 143.13 -0.18     0.86
##      se
## X1 0.32
dfTrain$Residuals <- m1$residuals
dfTrain$Fitted <- m1$fitted.values
library(tidyverse)
dfTrain_WithoutHR <- dfTrain %>%
  dplyr::filter(TARGET_WINS >=50 | PITCHING_HR!=0)

hist(dfTrain_WithoutHR$PITCHING_HR)

plot(dfTrain_WithoutHR$PITCHING_HR, dfTrain_WithoutHR$TARGET_WINS)

m3 <- lm(TARGET_WINS ~ PITCHING_HR, data=dfTrain_WithoutHR)
summary(m3)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfTrain_WithoutHR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.208  -9.802   0.653   9.952  66.914 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 76.624136   0.636539 120.376  < 2e-16 ***
## PITCHING_HR  0.041723   0.005197   8.028 1.58e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.09 on 2263 degrees of freedom
## Multiple R-squared:  0.02769,    Adjusted R-squared:  0.02726 
## F-statistic: 64.45 on 1 and 2263 DF,  p-value: 1.576e-15
plot(m3)

library(car) 
influencePlot(m3, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##         StudRes          Hat        CookD
## 299   4.4557422 0.0007060697 0.0069560265
## 829   0.2703629 0.0070966028 0.0002613277
## 856  -3.7394216 0.0014507753 0.0101000850
## 961  -0.9956483 0.0058665293 0.0029249611
## 1804  2.1826581 0.0049451007 0.0118181032
dfTrain_BiModal <- dfTrain %>%
  mutate(HR_Low = if_else(PITCHING_HR<50,1,0)) %>%
  mutate(HR_High = if_else(PITCHING_HR>=50,1,0))

dfCor_BiModal <- as.data.frame(cor(dfTrain_BiModal))
m4 <- lm(TARGET_WINS ~ PITCHING_HR + HR_Low, data=dfTrain_BiModal)
summary(m4)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR + HR_Low, data = dfTrain_BiModal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.692  -9.976   0.653  10.058  67.556 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 75.529253   1.069339  70.632  < 2e-16 ***
## PITCHING_HR  0.049398   0.007641   6.465 1.24e-10 ***
## HR_Low       0.162504   1.084033   0.150    0.881    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2273 degrees of freedom
## Multiple R-squared:  0.03574,    Adjusted R-squared:  0.03489 
## F-statistic: 42.12 on 2 and 2273 DF,  p-value: < 2.2e-16
plot(m4)

dfHighHR <- dfTrain_BiModal %>%
  dplyr::filter(HR_High ==1)

dfLowHR <- dfTrain_BiModal %>%
  dplyr::filter(HR_Low==1)

t.test(dfLowHR$TARGET_WINS, dfHighHR$TARGET_WINS)
## 
##  Welch Two Sample t-test
## 
## data:  dfLowHR$TARGET_WINS and dfHighHR$TARGET_WINS
## t = -5.4141, df = 753, p-value = 8.291e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.665804 -3.118167
## sample estimates:
## mean of x mean of y 
##  77.11327  82.00526
m5 <- lm(TARGET_WINS ~ PITCHING_HR, data=dfHighHR)
summary(m5)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_HR, data = dfHighHR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.641  -9.293   0.650   9.127  67.238 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 76.107959   0.957161  79.514  < 2e-16 ***
## PITCHING_HR  0.044983   0.006848   6.569 6.72e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.73 on 1709 degrees of freedom
## Multiple R-squared:  0.02463,    Adjusted R-squared:  0.02405 
## F-statistic: 43.15 on 1 and 1709 DF,  p-value: 6.72e-11
plot(m5)

dfCor_HR <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$PITCHING_HR)) 
dfCor_Low <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$HR_Low))

plot(dfTrain$BATTING_HR, dfTrain$PITCHING_HR)

dfTrain$HR_Diff <- dfTrain$PITCHING_HR -dfTrain$BATTING_HR
hist(dfTrain$HR_Diff, breaks=100)

describe(dfTrain$HR_Diff)
##    vars    n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 2276 6.09 15.1      2    2.93 2.97  -2 249   251 6.98    71.83 0.32

Sum of HR allowed greatly exceeds sum of HR hit

m6 <- lm(dfTrain$BATTING_HR ~ dfTrain$PITCHING_HR)
summary(m6)
## 
## Call:
## lm(formula = dfTrain$BATTING_HR ~ dfTrain$PITCHING_HR)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -234.609    0.123    1.336    6.992   12.817 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.592392   0.621547  -2.562   0.0105 *  
## dfTrain$PITCHING_HR  0.957481   0.005087 188.217   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.87 on 2274 degrees of freedom
## Multiple R-squared:  0.9397, Adjusted R-squared:  0.9397 
## F-statistic: 3.543e+04 on 1 and 2274 DF,  p-value: < 2.2e-16
plot(m6)

cor.test(dfTrain$BATTING_BB, dfTrain$PITCHING_BB)
## 
##  Pearson's product-moment correlation
## 
## data:  dfTrain$BATTING_BB and dfTrain$PITCHING_BB
## t = 26.759, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4574724 0.5199930
## sample estimates:
##       cor 
## 0.4893613
plot(dfTrain$BATTING_BB, dfTrain$PITCHING_BB)

  1. look at relationships with Dependent variable
dfTrain_ImputedMedian <- dfTrain_ImputedMean_NoCohort

for(i in 2:ncol(dfTrain_ImputedMedian)) {                              
  print(ggplot(dfTrain_ImputedMedian, aes(x = dfTrain_ImputedMedian[ ,i], y = dfTrain_ImputedMedian$TARGET_WINS)) +
          xlab(colnames(dfTrain)[i])  +
          stat_smooth(method=loess) +
          geom_point())

m <- lm(dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[ ,i])

par(mfcol=c(2,2))
print(summary(m))
print(plot(m))
}
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning in summary.lm(m): essentially perfect fit: summary may be unreliable

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.647e-14 -1.120e-15 -7.000e-16 -2.800e-16  1.614e-12 
## 
## Coefficients:
##                             Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)                1.756e-13  3.928e-15 4.470e+01   <2e-16 ***
## dfTrain_ImputedMedian[, i] 1.000e+00  4.775e-17 2.094e+16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.467e-14 on 2172 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 4.385e+32 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.761  -8.515   0.971   9.783  43.230 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                17.686332   3.164963   5.588 2.58e-08 ***
## dfTrain_ImputedMedian[, i]  0.042775   0.002136  20.025  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.31 on 2172 degrees of freedom
## Multiple R-squared:  0.1558, Adjusted R-squared:  0.1555 
## F-statistic:   401 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.863  -9.376   0.670  10.121  57.415 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                56.346919   1.737969   32.42   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.100118   0.007005   14.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.89 on 2172 degrees of freedom
## Multiple R-squared:  0.08597,    Adjusted R-squared:  0.08555 
## F-statistic: 204.3 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -76.628  -8.980   1.143  10.428  60.940 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                76.62804    0.72265 106.038  < 2e-16 ***
## dfTrain_ImputedMedian[, i]  0.07596    0.01180   6.439 1.48e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.43 on 2172 degrees of freedom
## Multiple R-squared:  0.01873,    Adjusted R-squared:  0.01828 
## F-statistic: 41.46 on 1 and 2172 DF,  p-value: 1.477e-10

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.596  -9.734   0.553  10.041  68.954 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                75.595947   0.658670 114.771   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.050009   0.005527   9.048   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.29 on 2172 degrees of freedom
## Multiple R-squared:  0.03632,    Adjusted R-squared:  0.03588 
## F-statistic: 81.87 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.936  -9.554   0.579   9.674  78.185 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                65.935670   1.370076   48.13   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.029358   0.002635   11.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.15 on 2172 degrees of freedom
## Multiple R-squared:  0.05406,    Adjusted R-squared:  0.05362 
## F-statistic: 124.1 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.228  -9.308   0.963  10.609  63.772 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                82.228036   1.043434   78.81   <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.001990   0.001344   -1.48    0.139    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.001008,   Adjusted R-squared:  0.0005482 
## F-statistic: 2.192 on 1 and 2172 DF,  p-value: 0.1389

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.284  -9.080   1.024  10.198  65.160 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                78.28444    0.57917 135.166  < 2e-16 ***
## dfTrain_ImputedMedian[, i]  0.02048    0.00392   5.226  1.9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.48 on 2172 degrees of freedom
## Multiple R-squared:  0.01242,    Adjusted R-squared:  0.01196 
## F-statistic: 27.31 on 1 and 2172 DF,  p-value: 1.899e-07

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.071  -9.493   1.233  10.483  65.236 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                80.07067    0.98260  81.489   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.01314    0.01750   0.751    0.453    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.0002595,  Adjusted R-squared:  -0.0002008 
## F-statistic: 0.5637 on 1 and 2172 DF,  p-value: 0.4528

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.165  -9.462   0.897  10.651  68.914 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                83.0150688  0.5308401 156.384  < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0012543  0.0002309  -5.432  6.2e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared:  0.0134, Adjusted R-squared:  0.01295 
## F-statistic:  29.5 on 1 and 2172 DF,  p-value: 6.205e-08

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.906  -9.846   0.705   9.965  67.942 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                74.905514   0.682649 109.728   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.053432   0.005465   9.777   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.25 on 2172 degrees of freedom
## Multiple R-squared:  0.04216,    Adjusted R-squared:  0.04171 
## F-statistic: 95.59 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.528  -9.251   0.948  10.415  70.006 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                74.528116   1.149967  64.809  < 2e-16 ***
## dfTrain_ImputedMedian[, i]  0.011187   0.001975   5.664 1.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2172 degrees of freedom
## Multiple R-squared:  0.01455,    Adjusted R-squared:  0.0141 
## F-statistic: 32.08 on 1 and 2172 DF,  p-value: 1.678e-08

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.570  -9.402   0.970  10.484  63.430 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                82.5704787  0.5945630 138.876  < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0022085  0.0006023  -3.667 0.000252 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.53 on 2172 degrees of freedom
## Multiple R-squared:  0.006152,   Adjusted R-squared:  0.005695 
## F-statistic: 13.45 on 1 and 2172 DF,  p-value: 0.0002515

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.638  -9.847   0.708  10.050  73.590 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                83.645750   0.476605 175.503   <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.011815   0.001415  -8.352   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared:  0.03112,    Adjusted R-squared:  0.03067 
## F-statistic: 69.75 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.809  -9.322   1.075  10.459  65.191 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                83.70498    2.23001  37.536   <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.01979    0.01484  -1.334    0.182    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.0008182,  Adjusted R-squared:  0.0003582 
## F-statistic: 1.779 on 1 and 2172 DF,  p-value: 0.1825

## NULL
dfTrain_ImputedMedian <- dfTrain_ImputedMean_NoCohort

EHExplore_Correlations_Scatterplots <- function(df, y)
{
  plot_list <- list()
  
  for(i in 1:ncol(df)) {
  
    ct <- cor.test(df[,i], df[,y])
    
  xText <- str_c("Correlation: ", round(ct$estimate,2), "   p value: ", round(ct$p.value,2))
    
  p <- eval(substitute(ggplot(df, aes_string(x=df[[i]], y)) +
  geom_point(fill="navy", color="white") +
  geom_smooth(method = "loess", color="red", fill="lightcoral") +
  ylab(y) +
    xlab(xText) +
    theme(title = element_text(size=9), axis.title.x = element_text(size = 8), axis.title.y = element_text(size = 9), axis.text.x = element_text(size = 8), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank(), panel.grid.minor.x=element_blank(), panel.grid.minor.y=element_blank(), panel.grid.major.y=element_line(color="gray"), panel.background = element_rect(fill = "slategray2", color="darkslategray")) +
  ggtitle(colnames(df)[i]), list(i=i)))
  plot_list[[i]] <- p 
}
  return(plot_list)
}

z4 <- EHExplore_Correlations_Scatterplots(dfTrain_ImputedMedian, "TARGET_WINS")

grid.arrange(grobs=z4[c(2:11)],  ncol=3, nrow=5)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#grid.arrange(grobs=z4[c(11:16)],  ncol=3, nrow=6)
zz2 <- list()

for(i in 1:length(z)) {
zz1[i*3-2] <- z[i]
zz1[i*3-1] <- z6[i]
zz1[i*3] <- z4[i]
}

grid.arrange(grobs=zz1[c(1:24)],  ncol=3, nrow=8)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

grid.arrange(grobs=zz1[c(25:48)],  ncol=3, nrow=8)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

#grid.arrange(grobs=zz1[c(25:36)],  ncol=3, nrow=4)
#grid.arrange(grobs=zz1[c(37:48)],  ncol=3, nrow=4)

Trying a transformation on team fielding error. it improves it to some degree.

dfTrain_ImputedMedian2 <- dfTrain_ImputedMedian %>%
  mutate(sq = FIELDING_E^2)

summary(lm(TARGET_WINS ~ FIELDING_E, dfTrain_ImputedMedian2))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E, data = dfTrain_ImputedMedian2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.638  -9.847   0.708  10.050  73.590 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83.645750   0.476605 175.503   <2e-16 ***
## FIELDING_E  -0.011815   0.001415  -8.352   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared:  0.03112,    Adjusted R-squared:  0.03067 
## F-statistic: 69.75 on 1 and 2172 DF,  p-value: < 2.2e-16
summary(lm(TARGET_WINS ~ FIELDING_E + sq, dfTrain_ImputedMedian2))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E + sq, data = dfTrain_ImputedMedian2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.981  -9.787   0.647  10.285  72.647 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.985e+01  7.178e-01 111.246  < 2e-16 ***
## FIELDING_E   1.386e-02  3.924e-03   3.533 0.000419 ***
## sq          -2.177e-05  3.108e-06  -7.005 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.17 on 2171 degrees of freedom
## Multiple R-squared:  0.05253,    Adjusted R-squared:  0.05165 
## F-statistic: 60.18 on 2 and 2171 DF,  p-value: < 2.2e-16

Regression

#Two mods made - team pitching has the square temr and intreaction between hits and dp

par(mfcol=c(2,2))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
summary(mod_2)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.264  -8.466   0.163   8.273  58.924 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.9560970  5.4876280   4.365 1.33e-05 ***
## INDEX       -0.0004771  0.0003788  -1.259 0.207988    
## BATTING_H    0.0482928  0.0037112  13.013  < 2e-16 ***
## BATTING_2B  -0.0232530  0.0092311  -2.519 0.011841 *  
## BATTING_3B   0.0595670  0.0169134   3.522 0.000437 ***
## BATTING_HR   0.0655424  0.0272468   2.406 0.016234 *  
## BATTING_BB   0.0084691  0.0057882   1.463 0.143567    
## BATTING_SO  -0.0100510  0.0025721  -3.908 9.61e-05 ***
## BASERUN_SB   0.0254437  0.0044746   5.686 1.47e-08 ***
## BASERUN_CS   0.0006521  0.0161429   0.040 0.967780    
## PITCHING_H  -0.0009865  0.0003651  -2.702 0.006949 ** 
## PITCHING_HR  0.0116273  0.0240289   0.484 0.628514    
## PITCHING_BB  0.0014808  0.0040999   0.361 0.718000    
## PITCHING_SO  0.0028141  0.0009069   3.103 0.001941 ** 
## FIELDING_E  -0.0186779  0.0024906  -7.499 9.31e-14 ***
## FIELDING_DP -0.1091373  0.0136377  -8.003 1.97e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.86 on 2158 degrees of freedom
## Multiple R-squared:  0.3226, Adjusted R-squared:  0.3179 
## F-statistic:  68.5 on 15 and 2158 DF,  p-value: < 2.2e-16
plot(mod_2)

library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + 
##     BATTING_HR + BATTING_BB + BATTING_SO + BASERUN_SB + PITCHING_H + 
##     PITCHING_SO + FIELDING_E + FIELDING_DP, data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.153  -8.411   0.176   8.307  58.465 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.6861348  5.2806294   4.296 1.82e-05 ***
## BATTING_H    0.0486089  0.0036841  13.194  < 2e-16 ***
## BATTING_2B  -0.0233877  0.0092203  -2.537 0.011265 *  
## BATTING_3B   0.0602198  0.0166990   3.606 0.000318 ***
## BATTING_HR   0.0770786  0.0097715   7.888 4.83e-15 ***
## BATTING_BB   0.0104799  0.0033563   3.122 0.001817 ** 
## BATTING_SO  -0.0104007  0.0024834  -4.188 2.93e-05 ***
## BASERUN_SB   0.0253857  0.0042813   5.929 3.53e-09 ***
## PITCHING_H  -0.0008928  0.0003178  -2.809 0.005008 ** 
## PITCHING_SO  0.0030690  0.0006625   4.633 3.82e-06 ***
## FIELDING_E  -0.0184139  0.0024107  -7.639 3.28e-14 ***
## FIELDING_DP -0.1095211  0.0136173  -8.043 1.43e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.86 on 2162 degrees of freedom
## Multiple R-squared:  0.3218, Adjusted R-squared:  0.3184 
## F-statistic: 93.27 on 11 and 2162 DF,  p-value: < 2.2e-16

Understanding the role of double plays - remove the influence of hits:

ggplot(dfTrain_ImputedMedian, aes(FIELDING_DP, PITCHING_H)) +
  geom_point()

ggplot(dfTrain, aes(FIELDING_DP, PITCHING_H)) +
  geom_point()
## Warning: Removed 286 rows containing missing values (geom_point).

cor(dfTrain_ImputedMedian$FIELDING_DP, dfTrain_ImputedMedian$PITCHING_H)
## [1] -0.08895731
summary(lm(TARGET_WINS ~ FIELDING_DP + PITCHING_H, dfTrain))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP + PITCHING_H, data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -66.999  -9.102   0.739  10.013  43.146 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 75.0829610  2.4592867  30.530  < 2e-16 ***
## FIELDING_DP -0.0045343  0.0121655  -0.373    0.709    
## PITCHING_H   0.0041845  0.0008319   5.030 5.34e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.85 on 1987 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.01377,    Adjusted R-squared:  0.01278 
## F-statistic: 13.87 on 2 and 1987 DF,  p-value: 1.038e-06
summary(lm(TARGET_WINS ~ FIELDING_DP + PITCHING_H, dfTrain_ImputedMedian))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP + PITCHING_H, data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.237  -9.564   0.855  10.359  68.964 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 87.1139149  2.2975487  37.916  < 2e-16 ***
## FIELDING_DP -0.0271240  0.0147930  -1.834   0.0669 .  
## PITCHING_H  -0.0012921  0.0002317  -5.576 2.76e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2171 degrees of freedom
## Multiple R-squared:  0.01493,    Adjusted R-squared:  0.01402 
## F-statistic: 16.45 on 2 and 2171 DF,  p-value: 8.127e-08
summary(lm(TARGET_WINS ~ FIELDING_DP*PITCHING_H, dfTrain))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP * PITCHING_H, data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.126  -9.261   1.004   9.713  47.202 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.023e+02  5.724e+00  17.872  < 2e-16 ***
## FIELDING_DP            -2.549e-01  4.914e-02  -5.188 2.35e-07 ***
## PITCHING_H             -1.244e-02  3.269e-03  -3.806 0.000145 ***
## FIELDING_DP:PITCHING_H  1.561e-04  2.970e-05   5.257 1.62e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.76 on 1986 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.02731,    Adjusted R-squared:  0.02584 
## F-statistic: 18.59 on 3 and 1986 DF,  p-value: 6.864e-12
summary(lm(TARGET_WINS ~ FIELDING_DP*PITCHING_H, dfTrain_ImputedMedian))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_DP * PITCHING_H, data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.162  -9.515   0.820  10.312  69.257 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             7.833e+01  5.757e+00  13.607   <2e-16 ***
## FIELDING_DP             3.302e-02  3.906e-02   0.845   0.3981    
## PITCHING_H              3.513e-03  2.898e-03   1.212   0.2256    
## FIELDING_DP:PITCHING_H -3.328e-05  2.001e-05  -1.663   0.0964 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2170 degrees of freedom
## Multiple R-squared:  0.01618,    Adjusted R-squared:  0.01482 
## F-statistic:  11.9 on 3 and 2170 DF,  p-value: 9.984e-08

The interaction temr makes a difference.

Taking a log of Pitching_H:

ggplot(dfTrain_ImputedMedian, aes(dfTrain_ImputedMedian$PITCHING_H)) +
  geom_histogram(bins=100)
## Warning: Use of `dfTrain_ImputedMedian$PITCHING_H` is discouraged. Use
## `PITCHING_H` instead.

dfTrain_ImputedMedian5 <- dfTrain_ImputedMedian2 %>%
  mutate(logPitch_h = PITCHING_H^2)

ggplot(dfTrain_ImputedMedian5, aes(logPitch_h, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ PITCHING_H + logPitch_h, dfTrain_ImputedMedian5)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H + logPitch_h, data = dfTrain_ImputedMedian5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.631  -9.694   1.045  10.242  64.174 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.944e+01  9.013e-01  88.133  < 2e-16 ***
## PITCHING_H   1.126e-03  5.376e-04   2.094   0.0364 *  
## logPitch_h  -1.313e-07  2.682e-08  -4.897 1.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.39 on 2171 degrees of freedom
## Multiple R-squared:  0.02418,    Adjusted R-squared:  0.02328 
## F-statistic:  26.9 on 2 and 2171 DF,  p-value: 2.895e-12
plot(m)

A closer look at Pitching_h. Taking out th outliers.

dfTrain_ImputedMedian6 <- dfTrain_ImputedMedian5 %>%
  dplyr::filter(PITCHING_H <= 1500)

dfTrain_ImputedMedian7 <- dfTrain_ImputedMedian5 %>%
  dplyr::filter(PITCHING_H > 2000)

ggplot(dfTrain_ImputedMedian6, aes(PITCHING_H, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ PITCHING_H, dfTrain_ImputedMedian6)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.864  -8.396   0.413   8.870  30.267 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.26774    8.46728   0.976    0.329    
## PITCHING_H   0.04990    0.00602   8.289 3.78e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.22 on 970 degrees of freedom
## Multiple R-squared:  0.06614,    Adjusted R-squared:  0.06518 
## F-statistic:  68.7 on 1 and 970 DF,  p-value: 3.785e-16
plot(m)

ggplot(dfTrain_ImputedMedian7, aes(PITCHING_H, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ PITCHING_H, dfTrain_ImputedMedian7)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.879 -13.887   2.392  15.885  65.947 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 90.487384   2.180193   41.50  < 2e-16 ***
## PITCHING_H  -0.002207   0.000418   -5.28 2.77e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.87 on 255 degrees of freedom
## Multiple R-squared:  0.09855,    Adjusted R-squared:  0.09502 
## F-statistic: 27.88 on 1 and 255 DF,  p-value: 2.767e-07
plot(m)

ggplot(dfTrain_ImputedMedian, aes(PITCHING_H, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ PITCHING_H, dfTrain_ImputedMedian)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.165  -9.462   0.897  10.651  68.914 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83.0150688  0.5308401 156.384  < 2e-16 ***
## PITCHING_H  -0.0012543  0.0002309  -5.432  6.2e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared:  0.0134, Adjusted R-squared:  0.01295 
## F-statistic:  29.5 on 1 and 2172 DF,  p-value: 6.205e-08
plot(m)

Eliminting outliers has no effect - but show outliers seem to be grouped (compare new outliers with old):

dfTrain_ImputedMedian_nooutliers <- dfTrain_ImputedMedian %>%
  dplyr::filter(INDEX != 1211 & INDEX != 1342 & INDEX != 1810)

m <- lm(TARGET_WINS ~ PITCHING_H, dfTrain_ImputedMedian_nooutliers)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H, data = dfTrain_ImputedMedian_nooutliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.170  -9.460   0.889  10.636  68.905 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83.0181857  0.5306250  156.45  < 2e-16 ***
## PITCHING_H  -0.0012530  0.0002307   -5.43 6.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2169 degrees of freedom
## Multiple R-squared:  0.01341,    Adjusted R-squared:  0.01296 
## F-statistic: 29.49 on 1 and 2169 DF,  p-value: 6.263e-08
plot(m)

looking for interactions:

par(mfcol=c(2,2))

dfTrain_ImputedMedian8 <- dfTrain_ImputedMedian %>%
  mutate(Pitch_h_Under1500 = ifelse(PITCHING_H<=1500, 1, 0))

dfTrain_ImputedMedian8$Pitch_h_Under1500 <- as.factor(dfTrain_ImputedMedian8$Pitch_h_Under1500)

for(i in 2:ncol(dfTrain_ImputedMedian8)) {                              
  print(ggplot(dfTrain_ImputedMedian8, aes(dfTrain_ImputedMedian8[,i], TARGET_WINS, color=Pitch_h_Under1500)) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE) +
  ggtitle(colnames(dfTrain_ImputedMedian8)[i]))
}
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

Similar analysis with the data missing records:

dfTrain_flag <- dfTrain2 %>%
  mutate(Missing_Flag = ifelse(is.na(BATTING_SO),1,0))

mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_flag)
step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_HBP + PITCHING_HR + 
##     PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP, data = dfTrain_flag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2248  -5.6294  -0.0212   5.0439  21.3065 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.95454   19.10292   3.191 0.001670 ** 
## BATTING_H    0.02541    0.01009   2.518 0.012648 *  
## BATTING_HBP  0.08712    0.04852   1.796 0.074211 .  
## PITCHING_HR  0.08945    0.02394   3.736 0.000249 ***
## PITCHING_BB  0.05672    0.00940   6.034 8.66e-09 ***
## PITCHING_SO -0.03136    0.00728  -4.308 2.68e-05 ***
## FIELDING_E  -0.17218    0.03970  -4.338 2.38e-05 ***
## FIELDING_DP -0.11904    0.03516  -3.386 0.000869 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.422 on 183 degrees of freedom
##   (2080 observations deleted due to missingness)
## Multiple R-squared:  0.5345, Adjusted R-squared:  0.5167 
## F-statistic: 30.02 on 7 and 183 DF,  p-value: < 2.2e-16

Only interaction appears with the fielding_errors. Hwoever, If we interact with itself it greatly improves the r squared.

dfTrain_ImputedMedian9 <- dfTrain_ImputedMedian8 %>%
  mutate(Pitch_h_squared = PITCHING_H^2) %>%
    mutate(Pitch_h_log = log(PITCHING_H)) %>%
    mutate(Pitch_h_sqrt = sqrt(PITCHING_H))

summary(lm(TARGET_WINS ~ Pitch_h_squared, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_squared, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.015  -9.069   0.997  10.158  66.609 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.119e+01  3.359e-01 241.736  < 2e-16 ***
## Pitch_h_squared -8.054e-08  1.147e-08  -7.024 2.88e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.4 on 2172 degrees of freedom
## Multiple R-squared:  0.02221,    Adjusted R-squared:  0.02176 
## F-statistic: 49.33 on 1 and 2172 DF,  p-value: 2.883e-12
summary(lm(TARGET_WINS ~ Pitch_h_log, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_log, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.408  -9.582   1.145  10.356  66.161 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  87.2807     7.9389  10.994   <2e-16 ***
## Pitch_h_log  -0.8795     1.0706  -0.822    0.411    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.0003106,  Adjusted R-squared:  -0.0001496 
## F-statistic: 0.6749 on 1 and 2172 DF,  p-value: 0.4114
summary(lm(TARGET_WINS ~ Pitch_h_sqrt, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_sqrt, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.753  -9.477   0.982  10.732  68.378 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  85.48013    1.47144   58.09  < 2e-16 ***
## Pitch_h_sqrt -0.11429    0.03474   -3.29  0.00102 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.54 on 2172 degrees of freedom
## Multiple R-squared:  0.00496,    Adjusted R-squared:  0.004501 
## F-statistic: 10.83 on 1 and 2172 DF,  p-value: 0.001017
m <- lm(TARGET_WINS ~ PITCHING_H*Pitch_h_Under1500, dfTrain_ImputedMedian8)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ PITCHING_H * Pitch_h_Under1500, data = dfTrain_ImputedMedian8)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.864  -9.153   0.979   9.772  67.940 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    8.643e+01  6.550e-01 131.965  < 2e-16 ***
## PITCHING_H                    -1.771e-03  2.322e-04  -7.628 3.55e-14 ***
## Pitch_h_Under15001            -7.816e+01  1.047e+01  -7.466 1.19e-13 ***
## PITCHING_H:Pitch_h_Under15001  5.167e-02  7.432e-03   6.952 4.76e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.08 on 2170 degrees of freedom
## Multiple R-squared:  0.06361,    Adjusted R-squared:  0.06232 
## F-statistic: 49.14 on 3 and 2170 DF,  p-value: < 2.2e-16
plot(m)

summary(lm(TARGET_WINS ~ FIELDING_E*Pitch_h_Under1500, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E * Pitch_h_Under1500, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.182  -9.571   0.598   9.826  73.499 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   87.867745   0.643380 136.572  < 2e-16 ***
## FIELDING_E                    -0.016158   0.001498 -10.787  < 2e-16 ***
## Pitch_h_Under15001            -0.776515   1.469068  -0.529    0.597    
## FIELDING_E:Pitch_h_Under15001 -0.042078   0.008364  -5.031 5.28e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.88 on 2170 degrees of freedom
## Multiple R-squared:  0.08892,    Adjusted R-squared:  0.08766 
## F-statistic: 70.59 on 3 and 2170 DF,  p-value: < 2.2e-16
summary(lm(TARGET_WINS ~ FIELDING_E, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ FIELDING_E, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.638  -9.847   0.708  10.050  73.590 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 83.645750   0.476605 175.503   <2e-16 ***
## FIELDING_E  -0.011815   0.001415  -8.352   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared:  0.03112,    Adjusted R-squared:  0.03067 
## F-statistic: 69.75 on 1 and 2172 DF,  p-value: < 2.2e-16

Final Mods:

dfTrain_ImputedMedian8$Pitch_h_Under1500 <- as.numeric(dfTrain_ImputedMedian8$Pitch_h_Under1500)

dfTrain_Final <- dfTrain_ImputedMean_NoCohort %>%
  mutate(Pitch_h_Under1500 = ifelse(PITCHING_H<=1500, 1, 0)) %>%
  mutate(Prod_DP_H = FIELDING_DP*PITCHING_H) %>%
  mutate(inter_H_Itself = PITCHING_H*Pitch_h_Under1500) %>%
  mutate(Inter_H_Err = FIELDING_E*Pitch_h_Under1500) %>%
  mutate(PITCHING_H = PITCHING_H) %>%
  mutate(E_sq = FIELDING_E^2) %>%
  mutate(BB_sq = -1*BATTING_BB^2) %>%
  mutate(BHR_sq = -1*BATTING_HR^2) %>%
  mutate(BSO_sq = -1*BATTING_SO^2) %>%
  mutate(PH_sq = -1*PITCHING_H^2) %>%
  mutate(PSO_sq = -PITCHING_SO^2) 

dfTrain_ImputedMean$Missing_Flag <- as.numeric(dfTrain_ImputedMean$Missing_Flag)

dfTrain_Final2 <- dfTrain_ImputedMean %>%
  mutate(Pitch_h_Under1500 = ifelse(PITCHING_H<=1500, 1, 0)) %>%
  mutate(Prod_DP_H = FIELDING_DP*PITCHING_H) %>%
  mutate(inter_H_Itself = PITCHING_H*Pitch_h_Under1500) %>%
  mutate(Inter_H_Err = FIELDING_E*Pitch_h_Under1500) %>%
  mutate(E_sq = FIELDING_E^2) %>%
  mutate(BB_sq = -1*BATTING_BB^2) %>%
  mutate(BHR_sq = -1*BATTING_HR^2) %>%
  mutate(BSO_sq = -1*BATTING_SO^2) %>%
  mutate(PH_sq = -1*PITCHING_H^2) %>%
  mutate(PSO_sq = -PITCHING_SO^2) %>%
  mutate(Inter_h_Cohort = PITCHING_H*Missing_Flag) %>%
  mutate(Inter_bb_Cohort = PITCHING_BB*Missing_Flag) %>%
  mutate(Inter_hr_Cohort = PITCHING_HR*Missing_Flag) %>%
  mutate(Inter_E_Cohort = FIELDING_E*Missing_Flag) %>%
  mutate(Inter_bh_Cohort = BATTING_H*Missing_Flag) %>%
  mutate(Inter_bhr_Cohort = BATTING_HR*Missing_Flag) %>%
  mutate(Inter_bbb_Cohort = BATTING_BB*Missing_Flag) %>%
  mutate(Inter_bs_Cohort = BASERUN_SB*Missing_Flag) 

mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMean_NoCohort)
step2 <- stepAIC(mod_2, trace=FALSE)
#summary(step2)

par(mfcol=c(2,2))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_Final)
step3 <- stepAIC(mod_2, trace=FALSE)
#summary(step3)

mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_Final2)
step4 <- stepAIC(mod_2, trace=FALSE)
summary(step4)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + 
##     BATTING_BB + BATTING_SO + BASERUN_SB + PITCHING_H + PITCHING_BB + 
##     FIELDING_E + FIELDING_DP + Missing_Flag + Pitch_h_Under1500 + 
##     inter_H_Itself + Inter_H_Err + BB_sq + BHR_sq + BSO_sq + 
##     PH_sq + Inter_E_Cohort + Inter_bhr_Cohort + Inter_bbb_Cohort + 
##     Inter_bs_Cohort, data = dfTrain_Final2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.224  -7.883   0.383   7.828  58.494 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.862e+01  7.211e+00   6.743 1.96e-11 ***
## BATTING_H          5.651e-02  3.809e-03  14.836  < 2e-16 ***
## BATTING_2B        -1.332e-02  8.793e-03  -1.515 0.129953    
## BATTING_3B         8.266e-02  1.590e-02   5.199 2.18e-07 ***
## BATTING_BB        -2.197e-01  1.816e-02 -12.094  < 2e-16 ***
## BATTING_SO         4.104e-02  7.230e-03   5.676 1.55e-08 ***
## BASERUN_SB         3.979e-02  4.284e-03   9.288  < 2e-16 ***
## PITCHING_H        -3.941e-03  1.044e-03  -3.776 0.000164 ***
## PITCHING_BB        1.909e-02  3.295e-03   5.793 7.87e-09 ***
## FIELDING_E        -3.595e-02  2.968e-03 -12.115  < 2e-16 ***
## FIELDING_DP       -8.629e-02  1.282e-02  -6.729 2.16e-11 ***
## Missing_Flag       2.730e+01  1.156e+01   2.362 0.018238 *  
## Pitch_h_Under1500  3.344e+01  9.425e+00   3.548 0.000396 ***
## inter_H_Itself    -1.719e-02  6.503e-03  -2.643 0.008276 ** 
## Inter_H_Err       -3.658e-02  7.000e-03  -5.226 1.90e-07 ***
## BB_sq             -2.041e-04  1.589e-05 -12.848  < 2e-16 ***
## BHR_sq            -2.731e-04  3.515e-05  -7.767 1.21e-14 ***
## BSO_sq             3.267e-05  4.696e-06   6.957 4.55e-12 ***
## PH_sq             -8.681e-08  3.439e-08  -2.524 0.011660 *  
## Inter_E_Cohort    -1.997e-01  2.698e-02  -7.401 1.89e-13 ***
## Inter_bhr_Cohort   3.348e-01  1.599e-01   2.094 0.036350 *  
## Inter_bbb_Cohort   4.961e-02  1.895e-02   2.618 0.008908 ** 
## Inter_bs_Cohort    4.692e-02  2.717e-02   1.727 0.084256 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.22 on 2253 degrees of freedom
## Multiple R-squared:  0.4041, Adjusted R-squared:  0.3983 
## F-statistic: 69.44 on 22 and 2253 DF,  p-value: < 2.2e-16
summary(step2)$adj.r.squared
## [1] 0.3183679
summary(step3)$adj.r.squared
## [1] 0.3776131
summary(step4)$adj.r.squared
## [1] 0.398272

Checking interactions with the missing vaolues cohort:

looking for interactions:

dfTrain_ImputedMean_NoCohort1 <- dfTrain_ImputedMean_NoCohort %>%
  mutate(BB_sq = -1*BATTING_BB^2)

summary(lm(TARGET_WINS ~ BATTING_BB, dfTrain_ImputedMean_NoCohort1))
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_BB, data = dfTrain_ImputedMean_NoCohort1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.936  -9.554   0.579   9.674  78.185 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 65.935670   1.370076   48.13   <2e-16 ***
## BATTING_BB   0.029358   0.002635   11.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.15 on 2172 degrees of freedom
## Multiple R-squared:  0.05406,    Adjusted R-squared:  0.05362 
## F-statistic: 124.1 on 1 and 2172 DF,  p-value: < 2.2e-16
summary(lm(TARGET_WINS ~ BATTING_BB + BB_sq, dfTrain_ImputedMean_NoCohort1))
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_BB + BB_sq, data = dfTrain_ImputedMean_NoCohort1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.421  -9.315   0.582   9.742  72.271 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.442e+01  2.462e+00  30.229  < 2e-16 ***
## BATTING_BB  -1.398e-02  1.079e-02  -1.296    0.195    
## BB_sq       -4.958e-05  1.197e-05  -4.142 3.58e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.09 on 2171 degrees of freedom
## Multiple R-squared:  0.06147,    Adjusted R-squared:  0.06061 
## F-statistic:  71.1 on 2 and 2171 DF,  p-value: < 2.2e-16

pitching SO has 20 zeroes which looks like missing values. Also, eliminate the 0 wins record.

x <- dfTrain_ImputedMean_NoCohort %>% 
  filter(PITCHING_SO == 0)
x
##    INDEX TARGET_WINS BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB
## 1    325         120      2270        301        132         42         74
## 2    326         146      2305        322        111         29         64
## 3    435          65      1464        147         32          3         94
## 4    459          23      1458        220         35          0         93
## 5    952          77      1895        244          8          8         93
## 6    953          73      1685        206         31          0         58
## 7   1106          49      1794        281         58          6         79
## 8   1107         107      1725        194         67          4         79
## 9   1347           0       891        135          0          0          0
## 10  1498          24      1289        145         41          7         45
## 11  1502         105      1767        249         77         20         95
## 12  1503          71      1491        200         57         17         50
## 13  2037          97      1903        256         50         18         71
## 14  2038         118      2086        280        135         22         89
## 15  2048          81      1927        207        142          8         78
## 16  2049          88      1622        155         67         12         52
## 17  2253          34      1177        171          9          0        119
## 18  2254          93      1527        200         64          0         79
## 19  2486          12      1009        112         75          0         12
## 20  2493          29      1122         69         64          0         29
##    BATTING_SO BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR PITCHING_BB
## 1           0   124.7618   52.80386       5253          97         171
## 2           0   124.7618   52.80386       4727          59         131
## 3           0   124.7618   52.80386       4312           9         277
## 4           0   124.7618   52.80386      16871           0        1076
## 5           0   124.7618   52.80386       5203          22         255
## 6           0   124.7618   52.80386       4074           0         140
## 7           0   124.7618   52.80386       5484          18         241
## 8           0   124.7618   52.80386       3408           8         156
## 9           0     0.0000    0.00000      24057           0           0
## 10          0   124.7618   52.80386       4443          24         155
## 11          0   124.7618   52.80386       4404          50         237
## 12          0   124.7618   52.80386       3552          41         119
## 13          0   124.7618   52.80386       5605          53         209
## 14          0   124.7618   52.80386       4629          49         198
## 15          0   124.7618   52.80386       5382          22         218
## 16          0   124.7618   52.80386       3864          29         124
## 17          0   124.7618   52.80386      10035           0        1015
## 18          0   124.7618   52.80386       3638           0         188
## 19          0   124.7618   52.80386      12574           0         150
## 20          0   124.7618   52.80386       6492           0         168
##    PITCHING_SO FIELDING_E FIELDING_DP
## 1            0       1058    146.3879
## 2            0        951    146.3879
## 3            0       1473    146.3879
## 4            0       1898    146.3879
## 5            0       1225    146.3879
## 6            0        931    146.3879
## 7            0       1531    146.3879
## 8            0        853    146.3879
## 9            0       1890    146.3879
## 10           0       1506    146.3879
## 11           0       1092    146.3879
## 12           0       1253    146.3879
## 13           0       1166    146.3879
## 14           0        928    146.3879
## 15           0       1447    146.3879
## 16           0       1132    146.3879
## 17           0       1279    146.3879
## 18           0       1010    146.3879
## 19           0        847    146.3879
## 20           0       1522    146.3879